## The Role of Case Management in Misdemeanor Prosecution
## Lindsay Graef and Aurelie Ouss
##
## Gets numbers for Figure 1 on flow of misdemeanor cases

library(tidyverse)
library(tidytable)
library(daotools)
library(arrow)
library(lubridate)
library(dplyr)

# Functions---------------------------------------------------------------------

## Get Cleaned Charges
# gets all charges from Jan 1 2011 to March 1, 2020
getChargesCleaned <- function(){
  
  load_arrests()
  
  cc <- get_complaint_charge_df() %>%
    mutate(grade = str_trim(gsub('[0-9]+','',charge_grade))) %>%
    # Identify if ALL charges are declined for arrest, and if most serious charge is misdemeanor
    mutate(declined = ifelse(all(is_charge_declined == TRUE),1,0),
           cc_lead_charge_arrest = case_when(any(grepl("F|H", grade)) ~ "F",
                                             any(grepl("M", grade)) ~ "M",
                                             TRUE ~ "S"),
           .by = arrest_id) %>%
    # also get lead charge at charging (for example, may be 2 M counts and one S count, but we only charge the S)
    mutate(cc_lead_charge_charging = case_when(any(grade %in% c("F","H") & is_charge_declined == FALSE) ~ "F",
                                               any(grade == "M" & is_charge_declined == FALSE) ~ "M",
                                               TRUE ~ "S"),
           .by = arrest_id) %>%
    # merge in arrest date from PARS
    left_join(., pars_arrests %>% select(arrest_id, dc_number, arrest_date, defendant_pid, defendant_age),
              by = c("arrest_id", "dc_number")) %>%
    mutate(rep_create_date = as.Date(rep_create_date, tz = "EDT")) %>%
    # filter to adult arrests made between Jan 1 2011 and March 1, 2020
    filter(rep_create_date > "2011-01-01" & rep_create_date < "2020-03-01" & 
             (defendant_age >= 18 | is.na(defendant_age)) ) %>%
    rename(arrest_date_pars = arrest_date)
  
  return(cc)
}


## Collapse Docket Entries and Deduplicate
# collapses docket entry text on same date in the same courtroom for a given docket
# also collapses official docket entries in same manner
collapseDocketEntries <- function(df){
  
  # create id_dlc variable to get unique docket_number-listing-courtrooms
  df <- df %>%
    mutate(id_dlc = paste(docket_number,listing_date,court_room_num), 
            .by = c(docket_number, listing_date,court_room_num)) 
  
  # collapse "official docket entry" column into one (paste together for each listing)
  df <- df %>%
    mutate(official_docket_entry_collapse = paste(official_docket_entry,collapse = " | "),
            docket_entry_comments_collapse = paste(docket_entry_comments,collapse = " | "),
            .by = id_dlc) %>%
    # remove duplicates
    select(-c(official_docket_entry,docket_entry_comments)) %>%
    distinct(id_dlc, .keep_all = TRUE)
  
  return(df)
}


## Mark Cases Diverted at Charging
# identifies MC cases that were diverted at charging
markChargingDiversions <- function(df){
  
  # If the first hearing after arraignment was "Progress Listing", or
  # "TC Status", etc. case was likely diverted at charging and goes directly to program.
  # If it has a 404 hearing first then diversion hearing, also counts as diverted from charging.
  # Otherwise, if there is a trial hearing scheduled first, it was diverted by MC ADA.
  diverted <- df %>%
    # get misdemeanor diversion cases
    filter(dispo_type == "Diversion" & startsWith(docket_number,"MC51CR")) %>%
    arrange(docket_entry_comments_collapse) %>%
    distinct(docket_number, listing_date, listing_type_cd, .keep_all = T) %>%
    arrange(listing_date) %>%
    # remove bail hearings and 404 hearings
    filter(!listing_type_cd %in% c("Preliminary Arraignment", "Arraignment","Bail Hearing",
                                    "Arraignment Preliminary Hearing","Emergency Release Hearing",
                                    "Early Bail Review")) %>%
    # note that this also removes 404 hearings at the end of cases, but that's ok here
    filter(!court_room_num == 404) %>%
    mutate(listing_number = row_number(), .by = docket_number) %>%
    # look at first post-bail, post-404 listing for each docket number to see if it's a diversion listing
    filter(listing_number == 1) %>%
    mutate(CU_divert = ifelse(listing_type_cd %in% c("Progress Listing","IP Status",
                                                       "TC - Progress Listing","DV Diversion Status",
                                                       "TC - Status","TC - Status Open",
                                                       "Summary Diversion Program",
                                                       "Status - Community Court","PDC Status",
                                                       "PDC Progress","DAP Status","DAP Progress")|
                                  grepl("AMP|ARD|Accele?ratr?ed Misde?m?ean?m?or|Program Accept|ard program|TCY",
                                        docket_entry_comments_collapse, ignore.case = T) |
                                  court_room_num %in% c("Broad and Champlost","Academy & Red Lion",
                                                        "Community Court","Mental Health Clinic",
                                                        "3901 Whitaker Avenue","55th and Pine Streets") , 1, 0))
  
  # get list of CU diversion cases
  cu_divert <- diverted %>%
    filter(CU_divert == 1)
  
  # mark CU diversions in main dataset and return
  df <- df %>%
    mutate(cu_divert = ifelse(docket_number %in% cu_divert$docket_number, 1, 0))
  
  return(df)
}


## Correct Diversion Dispositions
# Checks for diversion cases that were marked with "dismissed / withdrawn" disposition type. 
# Checks that those cases were successfully completed diversion; creates a new updated
# dispo_type variable that reflects this.
correctDiversionDispos <- function(df){
  
  # identify possible diversion cases
  # some of these are clearly failures (guilty pleas, etc.) but it's the dismissed / withdrawn 
  # cases that are concerning, since these may have been successful diversions (if they
  # show up as plea, etc., they either failed diversion or did not actually enter diversion)
  possible_diversion_cases <- df %>%
    filter(any(grepl("Community Court|ARC|ARD|PDC|Status of Admission|DV|Diversion|TC|MHC|",
                      listing_type_cd)), .by = docket_number) %>%  
    filter(dispo_type %in% c("Dismissed/Withdrawn/Etc","Withdrawn in the Interest of Justice")) %>%
    mutate(listing_number = row_number(), .by = docket_number) %>%
    select(docket_number,listing_date,listing_desc,listing_type_cd,listing_outcome,
            docket_entry_comments_collapse,official_docket_entry_collapse,
            dispo_type,case_disposition_type,listing_number)
  
  successes <- possible_diversion_cases %>%
    # look at last listing in case to see if it indicates diversion completion
    filter(listing_number == max(listing_number), .by = docket_number) %>%
    filter(grepl("conditions satisfied|(defendant )?completed|compliance",docket_entry_comments_collapse))
  
  # Update disposition for the cases that were successes:
  df <- df %>%
    mutate(dispo_type_update = as.factor(ifelse(docket_number %in% successes$docket_number,
                                                  "Diversion",as.character(dispo_type))))
  return(df)
}



## Get 404 Variable
# Creates a category called "404 Withdrawn" for cases that are sent to 
# 404 to be withdrawn / dismissed / etc. 
get404Vars <- function(df){
  
  # identify cases that went to a MC trial room
  went_mc <- df %>%
    filter(any(court_room_num %in% c("403","406","503","506","603","606","703","706","803","806","903","906","1103")), 
           .by = docket_number)
  
  # identify cases that ended in room 404 without going to MC trial first
  end404 <- df %>%
    arrange(docket_number, listing_date) %>%
    mutate(listing_number = row_number(), .by = docket_number) %>%
    filter(listing_number == max(listing_number), .by = docket_number) %>%
    filter(court_room_num == 404 & !docket_number %in% went_mc$docket_number)
  
  df <- df %>%
    mutate(dispo_type_update = case_when(docket_number %in% end404$docket_number &
                                             grepl("Withdrawn|Diversion", dispo_type) ~ "404 Withdrawn",
                                         grepl("Withdrawn in the Interest", dispo_type) ~"Dismissed/Withdrawn/Etc",
                                         TRUE ~ dispo_type))
  return(df)
}

## Get sentences from DAOCMS
getSentencesDAOCMS <- function(df){
  
  sentences <- load_case_listings_with_sentences() %>%
    select(docket_number, has_probation, has_incarceration) %>%
    mutate(any_probation = ifelse(any(has_probation) == TRUE, 1, 0),
           any_incarceration = ifelse(any(has_incarceration) == TRUE, 1, 0), .by = docket_number) %>%
    distinct(docket_number, .keep_all = TRUE) %>%
    mutate(any_probation = ifelse(is.na(any_probation), 0, any_probation),
           any_incarceration = ifelse(is.na(any_incarceration), 0, any_incarceration), .by = docket_number) %>%
    select(docket_number, any_probation, any_incarceration) %>%
    mutate(probation_only = ifelse(any_probation == 1 & any_incarceration == 0, 1, 0), .by = docket_number) %>%
    mutate(punished = ifelse(any_probation == 1 | any_incarceration == 1, 1, 0), .by = docket_number)
  
  df <- df %>%
    left_join(., sentences) %>%
    mutate(conviction = ifelse(any(dispo_type %in% c("Guilty","Guilty Plea/Nolo")), 1, 0), .by = docket_number) %>%
    mutate(convicted_no_sentence = ifelse(conviction == 1 & punished == 0, 1, 0))
  
  return(df)
}


## Get Linked Arrests and Cases
# gets a dataframe of linked arrests and cases with variables we need
# Note: multiple arrest_ids can go to the same dc_pid! Some dockets have two arrest_ids etc.
# So this dataset is at the docket_dc_pid level
getLinkedArrestsCases <- function(){
  
  load_arrests()
  
  cases <- load_cases() %>%
    select(docket_number, arrest_date, dc_number, defendant_pid, otn, case_grade,
           case_open_date, disposition_type, disposition_date, offense_category) %>%
    filter(!is.na(defendant_pid) & !is.na(dc_number)) %>%
    # join cases and PARS by pid and dc_number
    full_join(., pars_arrests %>% select(arrest_id, dc_number, defendant_pid, arrest_date) %>%
                rename(arrest_date_pars = arrest_date), 
              by = c("dc_number","defendant_pid")) %>%
    filter(!is.na(docket_number)) 
  
  return(cases)
}



## Get Dockets Dataset
# Uses docket entries / case listings for misdemeanor cases to identify diversions,
# 404 dispositions, etc.
# Returns a dataframe of disposition info for cases that are in misd_arrests_cases df
getDispositionsDf <- function(misd_arrests_cases){
  
  df <- readRDS("/srv/data/penn/court_actor_FTA/CLDEmerged.rds") %>%
    # keep just the misdemeanor cases we have in our arrests dataset from complaint charge table
    filter(!is.na(docket_number)) %>%
    filter(docket_number %in% misd_arrests_cases$docket_number) %>%
    # collapse any docket entry notes to hearing level
    collapseDocketEntries(.) %>% 
    # mark charging diversions
    markChargingDiversions(.) %>%
    correctDiversionDispos(.) %>%
    get404Vars(.) %>%
    getSentencesDAOCMS(.) %>%
    select(docket_number, cu_divert, dispo_type, dispo_type_update, case_disposition_type,
           conviction, convicted_no_sentence, punished, probation_only, any_probation, any_incarceration) %>%
    distinct(docket_number, .keep_all = TRUE) %>%
    # mark cases not yet disposed
    mutate(dispo_type = ifelse(is.na(dispo_type),"Not Yet Disposed",dispo_type)) %>%
    mutate(dispo_type_update = ifelse(is.na(dispo_type_update),"Not Yet Disposed",dispo_type_update))
  
  return(df)
}


# Build dataset of misdemeanor arrests------------------------------------------

# get charging data from Jan 2011 through March 1, 2020 (takes awhile to run)
cc <- getChargesCleaned()

# Link charging data to court case data
full_cases_charges <- cc %>%
  distinct(arrest_id, dc_number, defendant_pid, .keep_all = TRUE) %>%
  select(arrest_id, dc_number, defendant_pid, arrest_date_pars, declined, cc_lead_charge_arrest,
          cc_lead_charge_charging, defendant_age) %>%
  left_join(., getLinkedArrestsCases(), by = c("dc_number","defendant_pid", "arrest_id","arrest_date_pars"))

# Pull out arrests that were not matched to PARS / DAOCMS 
full_cases_charges %>%
  filter(is.na(docket_number)) %>%
  select(arrest_id, dc_number, defendant_pid, arrest_date_pars, declined, cc_lead_charge_arrest,
          cc_lead_charge_charging) %>%
  filter(cc_lead_charge_arrest == "M" & declined == 0) %>%
  distinct(arrest_id, .keep_all = T) %>%
  summarize(count = n())

# Get final dataset of misdemeanor arrests
misd_arrests_cases <- full_cases_charges %>%
  filter(cc_lead_charge_arrest == "M") %>%
  # keep just MC dockets (all start in MC, even if appealed to CP) and declinations
  filter(declined == 1 | startsWith(docket_number,"MC51CR")) %>%
  # merge on case information and create needed variables
  left_join(., getDispositionsDf(.), by = "docket_number") %>%
  mutate(wd_404 = ifelse(dispo_type_update == "404 Withdrawn", 1, 0)) %>%
  mutate(to_mc = ifelse(wd_404 == 0 & cu_divert == 0, 1, 0)) %>%
  mutate(dispo_type_update = case_when(dispo_type_update == "Diversion" & cu_divert == 0 ~ "MC Diversion",
                                       dispo_type_update == "Dismissed/Withdrawn/Etc" ~ "Withdrawn or Dismissed",
                                       dispo_type_update == "Diversion" ~ "CU Diversion",
                                       dispo_type_update %in% c("Exonerated/Won on Appeal",
                                                                "Not Guilty/Acquittal",
                                                                "Guilty") ~ "Trial",
                                       disposition_type == "* NA *" & !is.na(disposition_date) ~ "Missing Disposition",
                                       TRUE ~ dispo_type_update)) %>%
  mutate(punishment_cat = case_when(any_incarceration == 1 ~ "Incarceration",
                                  probation_only == 1 ~ "Probation Only",
                                  convicted_no_sentence == 1 ~ "Convicted No Sentence",
                                  TRUE ~ "Not Convicted"))

# Get number of arrests downgraded to a Summary Lead charge case
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(declined == 0) %>%
  mutate(summary_downgrade = ifelse(cc_lead_charge_charging == "S", 1, 0)) %>%
  filter(summary_downgrade == 1) %>%
  summarize(count = n())


# Get overall percents at each node --------------------------------------------

# Get total number of arrests
total_arrests <- length(unique(misd_arrests_cases$arrest_id))

# Get total number of police lead charges
cc %>%
  filter(arrest_id %in% misd_arrests_cases$arrest_id) %>%
  filter(is_charge_original_police_charge == T) %>%
  summarize(count = n())

# Declinations Node
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  summarize(decl = mean(declined),
            n_decl = sum(declined))

# CU Diverted
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(declined == 0) %>%
  summarize(n_divert = sum(cu_divert, na.rm = T)) %>%
  mutate(cu_divert_total = n_divert/total_arrests)

# 404 Withdrawn
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(declined == 0) %>%
  summarize(n_wd_404 = sum(wd_404, na.rm = T)) %>%
  mutate(wd_404_total = n_wd_404/total_arrests)

# Proceed to MC Node
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(declined == 0) %>%
  summarize(n_to_mc = sum(to_mc, na.rm = T)) %>%
  mutate(to_mc_total = n_to_mc/total_arrests)

# Non-trial Disposition Node
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(declined == 0 & to_mc == 1) %>%
  summarize(count = n(), .by = dispo_type_update) %>%
  mutate(percents = count/total_arrests)

# Trial Node
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(declined == 0 & to_mc == 1 & dispo_type_update == "Trial") %>%
  summarize(count = n(), .by = dispo_type) %>%
  mutate(percents = count/total_arrests)

# Formal Punishment Node
misd_arrests_cases %>%
  distinct(arrest_id, .keep_all = TRUE) %>%
  filter(dispo_type_update == "Trial" & dispo_type == "Guilty") %>%
  summarize(count = n(), .by = punishment_cat) %>%
  mutate(percents = count/total_arrests)
